View blog post for this code here¶

Step 1. Pre-Processing our data!¶

In [1]:
# Got to load in our dependencies!
from datetime import datetime
import pandas as pd
import plotly.express as px
import plotly.offline as pyo
import seaborn as sns
import plotly.graph_objects as go
import numpy as np
from pandas.api.types import is_numeric_dtype
from great_tables import GT, md, html, system_fonts, style, loc
In [2]:
# First, we can load in our dataframes, starting with the overall look at all of the listings within the city and its general information for 2024
nov_listings: pd.DataFrame = pd.read_csv('./datasets/new_york_listings.csv')

# For historical reasons, let's also load in the same listings dataset but from July 2023
jul_23_listings: pd.DataFrame = pd.read_csv('./datasets/NYC-Airbnb-2023.csv')
C:\Users\ernie\AppData\Local\Temp\ipykernel_21824\4071149682.py:5: DtypeWarning:

Columns (17) have mixed types. Specify dtype option on import or set low_memory=False.

In [3]:
nov_listings.head(3)
Out[3]:
id listing_url scrape_id last_scraped source name description neighborhood_overview picture_url host_id ... review_scores_communication review_scores_location review_scores_value license instant_bookable calculated_host_listings_count calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month
0 2595 https://www.airbnb.com/rooms/2595 20241104040953 2024-11-04 city scrape Skylit Midtown Castle Sanctuary Beautiful, spacious skylit studio in the heart... Centrally located in the heart of Manhattan ju... https://a0.muscache.com/pictures/miso/Hosting-... 2845 ... 4.8 4.81 4.40 NaN f 3 3 0 0 0.27
1 6848 https://www.airbnb.com/rooms/6848 20241104040953 2024-11-04 city scrape Only 2 stops to Manhattan studio Comfortable studio apartment with super comfor... NaN https://a0.muscache.com/pictures/e4f031a7-f146... 15991 ... 4.8 4.69 4.58 NaN f 1 1 0 0 1.04
2 6872 https://www.airbnb.com/rooms/6872 20241104040953 2024-11-04 city scrape Uptown Sanctuary w/ Private Bath (Month to Month) This charming distancing-friendly month-to-mon... This sweet Harlem sanctuary is a 10-20 minute ... https://a0.muscache.com/pictures/miso/Hosting-... 16104 ... 5.0 5.00 5.00 NaN f 2 0 2 0 0.03

3 rows × 75 columns

In [4]:
jul_23_listings.head(3)
Out[4]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365 number_of_reviews_ltm license
0 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75356 -73.98559 Entire home/apt 150 30 49 2022-06-21 0.30 3 314 1 NaN
1 5121 BlissArtsSpace! 7356 Garon Brooklyn Bedford-Stuyvesant 40.68535 -73.95512 Private room 60 30 50 2019-12-02 0.30 2 365 0 NaN
2 5203 Cozy Clean Guest Room - Family Apt 7490 MaryEllen Manhattan Upper West Side 40.80380 -73.96751 Private room 75 2 118 2017-07-21 0.72 1 0 0 NaN
In [5]:
# I like to do this because sometimes the columns have whitespace or weird capitalization you don't even realize
nov_listings.columns = np.vectorize(lambda x: x.strip().lower())(nov_listings.columns)

# Our first pre-processing step can just be dropping columns we definitely, 100% don't need for our analysis
print(f'nov_listings currently has {len(nov_listings.columns)} columns')
nov_listings.drop(
    columns=['picture_url', 
             'host_url',
             'neighbourhood', #Not really the neighborhood 
             'host_thumbnail_url', 
             'host_picture_url', 
             'host_has_profile_pic', 
             'host_identity_verified',
             'license',
             ],
    inplace = True
)

# Take the dollar sign out of the price column so we can do some analysis with it
nov_listings['price'] = nov_listings["price"].apply(
    lambda x: float(x.replace('$', '').replace(',','') if isinstance(x, str) else x)
    )

print(f'nov_listings currently has {len(nov_listings.columns)} columns')
nov_listings currently has 75 columns
nov_listings currently has 67 columns
In [6]:
# Can do the same for the 2023 listings, although there is a lot less data
jul_23_listings.columns = np.vectorize(lambda x: x.strip().lower())(jul_23_listings.columns)

print(f'jul_23_listings currently has {len(jul_23_listings.columns)} columns')
jul_23_listings.drop(
        columns=['license', 'number_of_reviews_ltm'],
        inplace=True)

print(f'jul_23_listings currently has {len(jul_23_listings.columns)} columns')
jul_23_listings currently has 18 columns
jul_23_listings currently has 16 columns
In [7]:
# Now, let's take a look at NaN values for each dataframe
print(f"""The number of NaN values per column in nov_listings: \n
{nov_listings.isna().sum().sort_values(ascending=False)[:11]}'
    """
    )

print(f"""
    'The number of NaN values per column in jul_23_listings: \n
      {jul_23_listings.isna().sum().sort_values(ascending=False)}
    """
    )

# That's helpful, but doesn't really give me a gauge on what percentage of rows this is, so let's calculate that!
nov_nan_percentages: pd.DataFrame = pd.DataFrame((nov_listings.isna().sum().sort_values(ascending=False) / len(nov_listings.index) * 100).round(2), columns=['Missing Value Percentage']).reset_index(names=['Column Name'])

nov_nan_table: GT = (
    GT(nov_nan_percentages[:10])
    .tab_header(
        title = html(
            "<span style='font-size:20px; font-weight:bold;'>Missing values for the November 2024 Table</span>"
            ),
        subtitle = html(
            "<span style='font-size:15px; font-weight:bold;'>Sorted by the Percentage of the Column Missing</span>"
            ),
    )
    .tab_options(
        table_font_names=system_fonts("industrial")
    )
    .data_color(
        columns=['Missing Value Percentage'],
        palette = 'RdPu',
    )
)

jul_nan_percentages: pd.DataFrame = pd.DataFrame(
    (
        jul_23_listings.isna().sum().sort_values(ascending=False)[:20] 
        / len(jul_23_listings.index) * 100
    ).round(2), 
    columns=['Missing Value Percentage']).reset_index(names=['Column Name'])

jul_nan_table: GT = (
    GT(jul_nan_percentages[:10])
    .tab_header(
        title = html(
            "<span style='font-size:20px; font-weight:bold;'>Missing values for the July 2023 Table</span>"
            ),
        subtitle = html(
            "<span style='font-size:15px; font-weight:bold;'>Sorted by the Percentage of the Column Missing</span>"
            ),
    )
    .tab_options(
        table_font_names=system_fonts("industrial"),
    )
    .data_color(
        columns=['Missing Value Percentage'],
        palette = 'RdPu',
    )
)

# Save our tables to a file
nov_nan_table.save(file='../img/November 2024 NaN Table.png', scale = 16)
jul_nan_table.save(file='../img/July 2023 NaN Table.png', scale = 16)

nov_nan_table.show()
jul_nan_table.show()
The number of NaN values per column in nov_listings: 

calendar_updated          37548
neighborhood_overview     16974
host_about                16224
host_response_rate        15001
host_response_time        15001
host_acceptance_rate      14983
beds                      14952
bathrooms                 14809
price                     14807
review_scores_location    11578
review_scores_value       11577
dtype: int64'
    

    'The number of NaN values per column in jul_23_listings: 

      last_review                       10304
reviews_per_month                 10304
name                                 12
host_name                             5
neighbourhood_group                   0
neighbourhood                         0
id                                    0
host_id                               0
longitude                             0
latitude                              0
room_type                             0
price                                 0
number_of_reviews                     0
minimum_nights                        0
calculated_host_listings_count        0
availability_365                      0
dtype: int64
    
Missing values for the November 2024 Table
Sorted by the Percentage of the Column Missing
Column Name Missing Value Percentage
calendar_updated 100.0
neighborhood_overview 45.21
host_about 43.21
host_response_rate 39.95
host_response_time 39.95
host_acceptance_rate 39.9
beds 39.82
bathrooms 39.44
price 39.43
review_scores_location 30.84
Missing values for the July 2023 Table
Sorted by the Percentage of the Column Missing
Column Name Missing Value Percentage
last_review 24.0
reviews_per_month 24.0
name 0.03
host_name 0.01
neighbourhood_group 0.0
neighbourhood 0.0
id 0.0
host_id 0.0
longitude 0.0
latitude 0.0
In [8]:
# First, let's get rid of that pesky calendar_updated column
nov_listings.drop(columns=['calendar_updated'], inplace=True)

# Let's get all the numerical columns with more than 30% of their values missing
missing_columns = [name for name, val in (nov_listings.isna().sum().sort_values(ascending=False) / len(nov_listings.index) * 100).items() if val > .3 and is_numeric_dtype(nov_listings[name])]

# Create a function that can replace values in a column based for each borough
def fill_na_with_group_means(df: pd.DataFrame, col: str, group_col: str = 'neighbourhood_group_cleansed') -> pd.Series:
    """ Returns a dictionary with the median for the grouped column that can be used to fill NaN values

    Args:
        df (pd.DataFrame): dataframe to utilize
        col (str): column to take the median of 
        group_col (str, optional): column to group by Defaults to 'neighbourhood_group_cleansed'.

    Returns:
        pd.Series: series with the indexes as the grouped_by indexes and the values as the medians of each group for the specified column
    """
    # print(df.groupby(group_col)[col].transform('median'))
    return df[col].fillna(df.groupby(group_col)[col].transform('median'))

# Do it for every missing column
for col in missing_columns:
    nov_listings[col] = fill_na_with_group_means(nov_listings, col)

From here, we would typically do the same for jul_23_listings, but based upon the analysis above, there aren't many important columns for null values at all, so we can leave that.

Step 2. Visualizations¶

In [9]:
# Set plotly to offline mode so we can display these visualizations
pyo.init_notebook_mode()
In [10]:
# For our first visualization, it might be helpful to look at for each borough how the number of listings have changed from 2023 to now
num_nov_of_listings: pd.Series = nov_listings.groupby('neighbourhood_group_cleansed').size()

num_jul_23_listings: pd.Series = jul_23_listings.groupby('neighbourhood_group').size()

total_diff: float = ((len(nov_listings) - len(jul_23_listings)) / len(jul_23_listings)) * -100

listings_change: pd.DataFrame = pd.DataFrame({
    'borough': num_nov_of_listings.index,
    'July 2023': num_jul_23_listings,
    'November 2024': num_nov_of_listings
})

listings_change['percent_change'] = ((listings_change['November 2024'] - listings_change['July 2023']) / listings_change['July 2023']) * 100

listings_change_fig = px.bar(
    data_frame = listings_change, 
    x = 'borough', 
    template='plotly_dark', 
    y = ['July 2023', 'November 2024'],
    barmode='group', 
    labels = {'borough': 'Borough', 'variable': 'Month'}, 
    color_discrete_map={'July 2023': '#a1c9f4', 'November 2024': '#8de5a1'} 
)

# Update layout
listings_change_fig.update_layout(
    yaxis = dict(title=dict(text='Number of Airbnb Listings')),
    font_family = "Raleway, sans-serif",
    title = dict(text=f'<b>How Local Law 18 changed the number of Airbnbs across NYC boroughs</b><br><sup>The number across the city decreased by {total_diff:.2f}% but that decrease varied across boroughs</sup>'),
    legend = dict (
        x = .5,
        y = -.3,
        orientation = 'h',
        yanchor = 'bottom',
        xanchor = 'center'
    )
)

# Add percent change information above each bar
for i, row in listings_change.iterrows():
    listings_change_fig.add_trace(go.Scatter(
        x=[row['borough']],
        y = [max(row['July 2023'], row['November 2024']) + 15],
        text=[f"{row['percent_change']:.2f}%"],
        mode="text",
        showlegend=False,
        textfont = dict(weight=600, size = 15)
    ))

# Have to do this weird thing where we save the image and then display it within the notebook because plotly graphs mess up my blog lol
listings_change_fig.write_image("../img/listings_change.png", scale=6, engine="kaleido")

Listings Change

In [11]:
jul_listings_change_map = px.scatter_map(
    data_frame = jul_23_listings,
    lat = 'latitude',
    lon = 'longitude',
    color = 'neighbourhood_group',
    map_style = 'carto-darkmatter',
    labels = {'neighbourhood_group': 'Borough'},
    opacity = .5,
    size_max = 10,
    title = 'Geographic Distribution of Airbnbs Across New York City: July 2023',
    zoom = 9
)

jul_listings_change_map.update_layout(
    font_family = "Raleway, sans-serif",
    legend = dict (
        x = .5,
        y = -.3,
        orientation = 'h',
        yanchor = 'bottom',
        xanchor = 'center'
    )
)

nov_listings_change_map = px.scatter_map(
    data_frame = nov_listings,
    lat = 'latitude',
    lon = 'longitude',
    color = 'neighbourhood_group_cleansed',
    labels = {'neighbourhood_group_cleansed': 'Borough'},
    map_style = 'carto-darkmatter',
    opacity = .5,
    size_max = 10,
    title = 'Geographic Distribution of Airbnbs Across New York City: November 2024',
    zoom = 9
)

nov_listings_change_map.update_layout(
    font_family = "Raleway, sans-serif",
    legend = dict (
        x = .5,
        y = -.3,
        orientation = 'h',
        yanchor = 'bottom',
        xanchor = 'center'
    )
)

# Save these maps to html so we can display them on the website
jul_listings_change_map.write_html('../plotly/jul_2023_airbnb_map.html')
nov_listings_change_map.write_html('../plotly/nov_2024_airbnb_map.html')

# Save these maps to pngs so we can display them within the notebook
jul_listings_change_map.write_image('../img/jul_listings_change_map.png', scale = 6, engine = 'kaleido')
nov_listings_change_map.write_image('../img/nov_listings_change_map.png', scale = 6, engine = 'kaleido')

# jul_listings_change_map.show()
# nov_listings_change_map.show()

July 2023 Listings

November 2024 Listings

That's some pretty cool insight, but the number of boroughs doesn't simply tell the entire story. How about the average prices? Let's explore that!

In [12]:
avg_price_nov_listings: pd.Series = nov_listings.groupby('neighbourhood_group_cleansed')['price'].mean()

avg_price_jul_23_listings: pd.Series = jul_23_listings.groupby('neighbourhood_group')['price'].mean()

avg_price_change: pd.DataFrame = pd.DataFrame({
    'borough': avg_price_nov_listings.index,
    'July 2023': avg_price_jul_23_listings,
    'November 2024': avg_price_nov_listings,
})

avg_price_change['percent_change'] = ((avg_price_change['November 2024'] - avg_price_change['July 2023']) / avg_price_change['July 2023']) * 100

avg_price_fig = px.bar(
    data_frame = avg_price_change,
    x = 'borough',
    y = ['July 2023', 'November 2024'],
    barmode = 'group',
    template = 'plotly_dark',
    labels = {'borough': 'Borough', 'variable': 'Month'}, 
    color_discrete_map={'July 2023': '#d0bbff', 'November 2024': '#fab0e4'}
)

# Update layout
avg_price_fig.update_layout(
    yaxis = dict(title=dict(text='Average Price of Airbnb Listing ($)')),
    font_family = "Raleway, sans-serif",
    title = dict(text=f'<b>How Local Law 18 changed the price of Airbnbs across NYC boroughs</b><br><sup>Across the board, the prices of Airbnbs decreased despite the stark decrease in options available</sup>'),
    legend = dict (
        x = .5,
        y = -.3,
        orientation = 'h',
        yanchor = 'bottom',
        xanchor = 'center'
    )
)

# Add percent change information above each bar
for i, row in avg_price_change.iterrows():
    avg_price_fig.add_trace(go.Scatter(
        x=[row['borough']],
        y = [row['July 2023'] + 10],
        text=[f"{row['percent_change']:.2f}%"],
        mode="text",
        showlegend=False,
        textfont = dict(weight=600,size = 15)
    ))

avg_price_fig.write_image("../img/avg_price_change.png", scale=6, engine="kaleido")

Average Price Change

In [13]:
# We will load a dataset from 2020 to help us drive this analysis
sep_2020_listings: pd.DataFrame = pd.read_csv('./datasets/sep_2020_listings.csv')

sep_2020_listings.head(5)
Out[13]:
id name summary description experiences_offered neighborhood_overview transit house_rules host_id host_since ... hot_tub_sauna_or_pool internet long_term_stays pets_allowed private_entrance secure self_check_in smoking_allowed accessible event_suitable
0 2539 Clean & quiet apt home by the park Renovated apt home in elevator building. Renovated apt home in elevator building. Spaci... none Close to Prospect Park and Historic Ditmas Park Very close to F and G trains and Express bus i... -The security and comfort of all our guests is... 2787 39698.0 ... -1 1 1 -1 -1 1 1 -1 1 1
1 3647 THE VILLAGE OF HARLEM....NEW YORK ! NaN WELCOME TO OUR INTERNATIONAL URBAN COMMUNITY T... none NaN NaN Upon arrival please have a legibile copy of yo... 4632 39777.0 ... -1 1 -1 -1 -1 -1 -1 -1 -1 -1
2 7750 Huge 2 BR Upper East Cental Park NaN Large Furnished 2BR one block to Central Park... none NaN NaN NaN 17985 39953.0 ... -1 1 -1 1 -1 -1 -1 -1 -1 -1
3 8505 Sunny Bedroom Across Prospect Park Just renovated sun drenched bedroom in a quiet... Just renovated sun drenched bedroom in a quiet... none Quiet and beautiful Windsor Terrace. The apart... Ten minutes walk to the 15th sheet F&G train s... - No shoes in the house - Quiet hours after 11... 25326 40006.0 ... -1 1 -1 -1 -1 -1 -1 -1 -1 -1
4 8700 Magnifique Suite au N de Manhattan - vue Cloitres Suite de 20 m2 a 5 min des 2 lignes de metro a... Suite de 20 m2 a 5 min des 2 lignes de metro a... none NaN Metro 1 et A NaN 26394 40014.0 ... -1 1 -1 -1 -1 -1 -1 -1 -1 -1

5 rows × 81 columns

In [14]:
def convert_excel_serial_to_date(col: pd.Series) -> pd.Series:
    """
    Convert an Excel-style serial date to a readable date format.

    Args:
        col (pd.Series): Numeric value representing a date in Excel format.
    Returns:
        pd.Series: Corresponding date in YYYY-MM-DD format.
    """
    try:
        return pd.to_datetime(col, origin='1899-12-30', unit='D')
    except:
        return None  # Return None for invalid values
In [15]:
# Convert the host_since column here to an actual date (for some reason it's in some weird Excel format)
sep_2020_listings['host_since'] = sep_2020_listings['host_since'].apply(convert_excel_serial_to_date)

sep_2020_listings['host_since']
Out[15]:
0       2008-09-07
1       2008-11-25
2       2009-05-20
3       2009-07-12
4       2009-07-20
           ...    
30174   2016-12-18
30175   2013-08-18
30176   2013-05-25
30177   2015-04-10
30178   2016-04-20
Name: host_since, Length: 30179, dtype: datetime64[ns]
In [16]:
def calculate_age_of_host(col: pd.Series, year: int, month: int = 1) -> pd.Series:
    """
        Calculate the amount of time has elapsed in years for a given column

    Args:
        col (pd.Series): Series of the dates (in datetime format)
        Year (int): Year to calculate the date from
        month (int, optional): Month to calculate the datetime with. Defaults to 1.

    Returns:
        pd.Series: series with the indexes as the grouped_by indexes and the values as the medians of each group for the specified column
    """
    
    # Convert the year in which the data was collected to datetime
    reference_date = datetime(year, month, 1)

    # Make sure the column given is in datetime
    col = pd.to_datetime(col)

    # Calculate the amount of days that have elapsed
    elapsed_time: pd.Series = reference_date - col

    elapsed_years: pd.Series = elapsed_time.dt.days / 365.25

    return elapsed_years
In [46]:
# Get the hosting age for November 2024
nov_listings['hosting_age'] = calculate_age_of_host(nov_listings['host_since'].dropna(), 2024, 11)
nov_average_age: np.float64 = np.average(nov_listings['hosting_age'].dropna())

# Get the hosting age for September 2020
sep_2020_listings['hosting_age'] = calculate_age_of_host(sep_2020_listings['host_since'].dropna(), 2020, 9)
sep_2020_average_age: np.float64 = np.average(sep_2020_listings['hosting_age'].dropna())
In [52]:
NUM_OF_BINS = 25

RAINBOW_COLORS = ["#ffadad","#ffd6a5","#fdffb6","#caffbf","#9bf6ff","#a0c4ff","#bdb2ff","#ffc6ff"]

RAINBOW_COLOR_SEQ = [RAINBOW_COLORS[i % len(RAINBOW_COLORS)] for i in range(NUM_OF_BINS)]

nov_listings_hosting_age = px.histogram(
    data_frame = nov_listings,
    x = 'hosting_age',
    nbins = NUM_OF_BINS,
    template = 'plotly_dark',
    labels = {'count': 'Number of Airbnbs', 'hosting_age': 'Age of Airbnb (years)'},
)

nov_listings_hosting_age.update_layout(
    font_family = "Raleway, sans-serif",
    title = dict(text=f'<b>Hosting Age of Airbnb Hosts in November 2024</b><br><sup>In November 2024, the average age of a listing in New York City was {nov_average_age:.2f}</sup>'),
    legend = dict (
        x = .5,
        y = -.3,
        orientation = 'h',
        yanchor = 'bottom',
        xanchor = 'center'
    )
)

for bar in nov_listings_hosting_age.data:
    bar.marker.color = RAINBOW_COLOR_SEQ



# nov_listings_hosting_age.show()

November 2024 Hosting Age

In [51]:
sep_2020_listings_hosting_age = px.histogram(
    data_frame = sep_2020_listings,
    x = 'hosting_age',
    nbins = NUM_OF_BINS,
    template = 'plotly_dark',
    labels = {'count': 'Number of Airbnbs', 'hosting_age': 'Age of Airbnb (years)'},
)

sep_2020_listings_hosting_age.update_layout(
    font_family = "Raleway, sans-serif",
    title = dict(text=f'<b>Hosting Age of Airbnb Hosts in September 2020</b><br><sup>In September 2020, the average age of a listing in New York City was {sep_2020_average_age:.2f}</sup>'),
    legend = dict (
        x = .5,
        y = -.3,
        orientation = 'h',
        yanchor = 'bottom',
        xanchor = 'center'
    )
)

for bar in sep_2020_listings_hosting_age.data:
    bar.marker.color = RAINBOW_COLOR_SEQ

# sep_2020_listings_hosting_age

September 2020 Host Age

In [50]:
# Export these histograms to images
nov_listings_hosting_age.write_image('../img/nov_host_age.png', scale = 6, engine = 'kaleido')
sep_2020_listings_hosting_age.write_image('../img/sep_2020_host_age.png', scale = 6, engine = 'kaleido')
In [ ]: